Initial checkin of parallax code.
See README-PARALLAX for details.
Signed-off-by: andrew.warfield@cl.cam.ac.uk
40e1b09db5mN69Ijj0X_Eol-S7dXiw tools/Rules.mk
4209033eUwhDBJ_bxejiv5c6gjXS4A tools/blktap/Makefile
4209033ewLAHdhGrT_2jo3Gb_5bDcA tools/blktap/README
+42277b02mYXxgijE7MFeUe9d8eldMw tools/blktap/README-PARALLAX
4209033eX_Xw94wHaOCtnU9nOAtSJA tools/blktap/blkaio.c
4209033egwf6LDxM2hbaqi9rRdZy4A tools/blktap/blkaiolib.c
4209033f9yELLK85Ipo2oKjr3ickgQ tools/blktap/blkaiolib.h
42090340_mvZtozMjghPJO0qsjk4NQ tools/blktap/blkint.h
42090340rc2q1wmlGn6HtiJAkqhtNQ tools/blktap/blktaplib.c
42090340C-WkRPT7N3t-8Lzehzogdw tools/blktap/blktaplib.h
+42277b02WrfP1meTDPv1M5swFq8oHQ tools/blktap/blockstore.c
+42277b02P1C0FYj3gqwTZUD8sxKCug tools/blktap/blockstore.h
42090340B3mDvcxvd9ehDHUkg46hvw tools/blktap/libgnbd/Makefile
42090340ZWkc5Xhf9lpQmDON8HJXww tools/blktap/libgnbd/gnbdtest.c
42090340ocMiUScJE3OpY7QNunvSbg tools/blktap/libgnbd/libgnbd.c
42090340G5_F_EeVnPORKB0pTMGGhA tools/blktap/libgnbd/libgnbd.h
+42277b03930x2TJT3PZlw6o0GERXpw tools/blktap/parallax.c
+42277b03XQYq8bujXSz7JAZ8N7j_pA tools/blktap/radix.c
+42277b03vZ4-jno_mgKmAcCW3ycRAg tools/blktap/radix.h
+42277b03U_wLHL-alMA0bfxGlqldXg tools/blktap/snaplog.c
+42277b04Ryya-z662BEx8HnxNN0dGQ tools/blktap/snaplog.h
+42277b04LxFjptgZ75Z98DUAso4Prg tools/blktap/vdi.c
+42277b04tt5QkIvs8She8CQqH5kwpg tools/blktap/vdi.h
+42277b04zMAhB0_946sHQ_H2vwnt0Q tools/blktap/vdi_create.c
+42277b04xB_iUmiSm6nKcy8OV8bckA tools/blktap/vdi_fill.c
+42277b045CJGD_rKH-ZT_-0X4knhWA tools/blktap/vdi_list.c
+42277b043ZKx0NJSbcgptQctQ5rerg tools/blktap/vdi_snap.c
+42277b043Fjy5-H7LyBtUPyDlZFo6A tools/blktap/vdi_snap_list.c
+42277b04vhqD6Lq3WmGbaESoAAKdhw tools/blktap/vdi_tree.c
+42277b047H8fTVyUf75BWAjh6Zpsqg tools/blktap/vdi_validate.c
4124b307nRyK3dhn1hAsvrY76NuV3g tools/check/Makefile
4124b307vHLUWbfpemVefmaWDcdfag tools/check/README
4124b307jt7T3CHysgl9LijNHSe1tA tools/check/check_brctl
XEN_ROOT = ../..
include $(XEN_ROOT)/tools/Rules.mk
+BLKTAP_INSTALL_DIR = /usr/sbin
+
+INSTALL = install
+INSTALL_PROG = $(INSTALL) -m0755
+INSTALL_DIR = $(INSTALL) -d -m0755
+
INCLUDES +=
SRCS :=
SRCS += blktaplib.c
+# Sources shared by the parallax daemon and the standalone vdi_* tools.
+PLX_SRCS :=
+PLX_SRCS += vdi.c
+PLX_SRCS += radix.c
+PLX_SRCS += blockstore.c
+PLX_SRCS += snaplog.c
+# Snapshot the list with := BEFORE parallax.c is appended, so VDI_SRCS holds
+# only the four sources above and never picks up parallax.c.
+VDI_SRCS := $(PLX_SRCS)
+PLX_SRCS += parallax.c
+
+VDI_TOOLS :=
+VDI_TOOLS += vdi_create
+VDI_TOOLS += vdi_list
+VDI_TOOLS += vdi_snap
+VDI_TOOLS += vdi_snap_list
+VDI_TOOLS += vdi_fill
+VDI_TOOLS += vdi_tree
+VDI_TOOLS += vdi_validate
+
CFLAGS += -Wall
CFLAGS += -Werror
CFLAGS += -Wno-unused
LIB = libblktap.so libblktap.so.$(MAJOR) libblktap.so.$(MAJOR).$(MINOR)
-all: mk-symlinks blkdump blkcow blkimg blkcowimg blkgnbd blkcowgnbd blkaio
+all: mk-symlinks blkdump blkcow blkimg blkcowimg blkgnbd blkcowgnbd $(VDI_TOOLS) parallax
$(MAKE) $(LIB)
LINUX_ROOT := $(wildcard $(XEN_ROOT)/linux-2.6.*-xen-sparse)
ln -sf ../../$(LINUX_ROOT)/include/asm-xen/linux-public/*.h . )
install: all
- mkdir -p $(prefix)/usr/lib
- mkdir -p $(prefix)/usr/include
- install -m0755 $(LIB) $(prefix)/usr/lib
- ln -sf libblktap.so.$(MAJOR).$(MINOR) \
- $(prefix)/usr/lib/libblktap.so.$(MAJOR)
- ln -sf libblktap.so.$(MAJOR) $(prefix)/usr/lib/libblktap.so
- install -m0644 blktaplib.h $(prefix)/usr/include
+	$(INSTALL_DIR) -p $(DESTDIR)/usr/lib
+	$(INSTALL_DIR) -p $(DESTDIR)/usr/include
+# The tools below are copied into BLKTAP_INSTALL_DIR, so create it first;
+# otherwise installing into a fresh DESTDIR fails.
+	$(INSTALL_DIR) -p $(DESTDIR)/$(BLKTAP_INSTALL_DIR)
+	$(INSTALL_PROG) $(LIB) $(DESTDIR)/usr/lib
+# Headers are data, not programs: install them without the execute bits.
+	$(INSTALL) -m0644 blktaplib.h $(DESTDIR)/usr/include
+	$(INSTALL_PROG) blkdump blkcow blkimg blkcowimg blkgnbd blkcowgnbd $(DESTDIR)/$(BLKTAP_INSTALL_DIR)
clean:
- rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) xen TAGS blkdump blkcow blkimg blkcowimg blkgnbd blkcowgnbd blkaio
+ rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) xen TAGS blkdump blkcow blkimg blkcowimg blkgnbd blkcowgnbd blkaio $(VDI_TOOLS) parallax
rpm: all
rm -rf staging
blkaio: $(LIB) blkaio.c blkaiolib.c
$(CC) $(CFLAGS) -o blkaio -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. -lblktap blkaio.c blkaiolib.c -laio -lpthread
+parallax: $(LIB) $(PLX_SRCS)
+ $(CC) $(CFLAGS) -o parallax -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. -lblktap $(PLX_SRCS) libgnbd/libgnbd.a
+
+vdi_test: $(LIB) $(VDI_SRCS)
+ $(CC) $(CFLAGS) -g3 -o vdi_test -DVDI_STANDALONE $(VDI_SRCS)
+
+vdi_list: $(LIB) vdi_list.c $(VDI_SRCS)
+ $(CC) $(CFLAGS) -g3 -o vdi_list vdi_list.c $(VDI_SRCS)
+
+vdi_create: $(LIB) vdi_create.c $(VDI_SRCS)
+ $(CC) $(CFLAGS) -g3 -o vdi_create vdi_create.c $(VDI_SRCS)
+
+vdi_snap: $(LIB) vdi_snap.c $(VDI_SRCS)
+ $(CC) $(CFLAGS) -g3 -o vdi_snap vdi_snap.c $(VDI_SRCS)
+
+vdi_snap_list: $(LIB) vdi_snap_list.c $(VDI_SRCS)
+ $(CC) $(CFLAGS) -g3 -o vdi_snap_list vdi_snap_list.c $(VDI_SRCS)
+
+vdi_tree: $(LIB) vdi_tree.c $(VDI_SRCS)
+ $(CC) $(CFLAGS) -g3 -o vdi_tree vdi_tree.c $(VDI_SRCS)
+
+vdi_fill: $(LIB) vdi_fill.c $(VDI_SRCS)
+ $(CC) $(CFLAGS) -g3 -o vdi_fill vdi_fill.c $(VDI_SRCS)
+
+vdi_validate: $(LIB) vdi_validate.c $(VDI_SRCS)
+ $(CC) $(CFLAGS) -g3 -o vdi_validate vdi_validate.c $(VDI_SRCS)
+
+
+rdx_cmp: $(LIB) rdx_cmp.c $(VDI_SRCS)
+ $(CC) $(CFLAGS) -g3 -o rdx_cmp rdx_cmp.c $(VDI_SRCS)
+
+
.PHONY: TAGS clean install mk-symlinks rpm
TAGS:
etags -t $(SRCS) *.h
--- /dev/null
+Parallax Quick Overview
+March 3, 2005
+
+This is intended to provide a quick set of instructions to let you
+guys play with the current parallax source. In its current form, the
+code will let you run an arbitrary number of VMs off of a single disk
+image, doing copy-on-write as they make updates. Each domain is
+assigned a virtual disk image (VDI), which may be based on a snapshot
+of an existing image. All of the VDI and snapshot management should
+currently work.
+
+The current implementation uses a single file as a blockstore for
+_everything_; this will soon be replaced by the fancier backend code
+and the local cache. As it stands, Parallax will create
+"blockstore.dat" in the directory that you run it from, and use
+largefile support to make this grow to unfathomable girth. So, you
+probably want to run the daemon off of a local disk, with a lot of
+free space.
+
+Here's how to get going:
+
+0. Setup:
+---------
+
+Pick a local directory on a disk with lots of room. You should be
+running from a privileged domain (e.g. dom0) with the block tap (blktap)
+driver configured in and the block backend driver NOT configured in.
+
+For convenience (for the moment) copy all of the vdi tools (vdi_*) and
+the parallax daemon from tools/blktap into this directory.
+
+1. Populate the blockstore:
+---------------------------
+
+First you need to put at least one image into the blockstore. You
+will need a disk image, either as a file or local partition. My
+general approach has been to
+
+(a) make a really big sparse file with
+
+ dd if=/dev/zero of=./image bs=4K count=1 seek=[big value]
+
+(b) put a filesystem into it
+
+ mkfs.ext3 ./image
+
+(c) mount it using loopback
+
+ mkdir ./mnt
+ mount -o loop ./image ./mnt
+
+(d) cd into it and untar one of the image files from srg-roots.
+
+ cd mnt
+ tar ...
+
+NOTE: Beware if your system is FC3. mkfs is not compatible with old
+versions of fedora, and so you don't have much choice but to install
+further fc3 images if you have used the fc3 version of mkfs.
+
+(e) unmount the image
+
+ cd ..
+ umount mnt
+
+(f) now, create a new VDI to hold the image
+
+ ./vdi_create "My new FC3 VDI"
+
+(g) get the id of the new VDI.
+
+ ./vdi_list
+
+ | 0 My new FC3 VDI
+
+(0 is the VDI id... create a few more if you want.)
+
+(h) hoover your image into the new VDI.
+
+ ./vdi_fill 0 ./image
+
+This will pull the entire image into the blockstore and set up a
+mapping tree for it for VDI 0. Passing a device (i.e. /dev/sda3)
+should also work, but vdi_fill has NO notion of sparseness yet, so you
+are going to pump a block into the store for each block you read.
+
+vdi_fill will count up until it is done, and you should be ready to
+go. If you want to be anal, you can use vdi_validate to test the VDI
+against the original image.
+
+2. Create some extra VDIs
+-------------------------
+
+VDIs are actually a list of snapshots, and each snapshot is a full
+image of mappings. So, to preserve an immutable copy of a current
+VDI, do this:
+
+(a) Snapshot your new VDI.
+
+ ./vdi_snap 0
+
+Snapshotting writes the current radix root to the VDI's snapshot log,
+and assigns it a new writable root.
+
+(b) look at the VDI's snapshot log.
+
+ ./vdi_snap_list 0
+
+ | 16 0 Thu Mar 3 19:27:48 2005 565111 31
+
+The first two columns constitute a snapshot id and represent the
+(block, offset) of the snapshot record. The Date tells you when the
+snapshot was made, and 31 is the radix root node of the snapshot.
+
+(c) Create a new VDI, based on that snapshot, and look at the list.
+
+ ./vdi_create "FC3 - Copy 1" 16 0
+ ./vdi_list
+
+ | 0 My new FC3 VDI
+ | 1 FC3 - Copy 1
+
+NOTE: If you have Graphviz installed on your system, you can use
+vdi_tree to generate a postscript of your current set of VDIs and
+snapshots.
+
+
+Create as many VDIs as you need for the VMs that you want to run.
+
+3. Boot some VMs:
+-----------------
+
+Parallax currently uses a hack in xend to pass the VDI id; you need to
+modify the disk line of the VM config that is going to mount it.
+
+(a) set up your vm config, by using the following disk line:
+
+ disk = ['parallax:1,sda1,w,0' ]
+
+This example uses VDI 1 (from vdi_list above), presents it as sda1
+(writable), and uses dom 0 as the backend. If you were running the
+daemon (and tap driver) in some domain other than 0, you would change
+this last parameter.
+
+NOTE: You'll need to have reinstalled xend/tools prior to booting the vm, so that it knows what to do with "parallax:".
+
+(b) Run parallax in the backend domain.
+
+ ./parallax
+
+(c) create your new domain.
+
+ xm create ...
+
+---
+
+That's pretty much all there is to it at the moment. Hope this is
+clear enough to get you going. Now, a few serious caveats that will
+be sorted out in the almost immediate future:
+
+WARNINGS:
+---------
+
+1. There is NO locking in the VDI tools at the moment, so I'd avoid
+running them in parallel, or more importantly, running them while the
+daemon is running.
+
+2. I doubt that xend will be very happy about restarting if you have
+parallax-using domains. So if it dies while there are active parallax
+doms, you may need to reboot.
+
+3. I've turned off write-in-place. So at the moment, EVERY block
+write is a log append on the blockstore. I've been having some probs
+with the radix tree's marking of writable blocks after snapshots and
+will sort this out very soon.
+
+
#ifndef __BLKTAPLIB_H__
#define __BLKTAPLIB_H__
+#ifndef __SHORT_INT_TYPES__
+#define __SHORT_INT_TYPES__
+
#include <stdint.h>
typedef uint8_t u8;
typedef int16_t s16;
typedef int32_t s32;
typedef int64_t s64;
+
+#endif /* __SHORT_INT_TYPES__ */
#if defined(__i386__)
#define rmb() __asm__ __volatile__ ( "lock; addl $0,0(%%esp)" : : : "memory" )
--- /dev/null
+/**************************************************************************
+ *
+ * blockstore.c
+ *
+ * Simple block store interface
+ *
+ */
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "blockstore.h"
+
+static int block_fp = -1;
+
+/**
+ * readblock: read a block from disk
+ * @id: block id to read
+ *
+ * @return: pointer to block, NULL on error
+ */
+
+void *readblock(u64 id) {
+    /* Block ids are 1-based: id N starts at byte (N-1)*BLOCK_SIZE. */
+    off64_t where = ((off64_t) id - 1LL) * BLOCK_SIZE;
+    void *buf;
+
+    if (lseek64(block_fp, where, SEEK_SET) < 0) {
+        printf ("%Ld\n", (id - 1) * BLOCK_SIZE);
+        perror("readblock lseek");
+        return NULL;
+    }
+
+    /* The caller owns the returned buffer and releases it via freeblock(). */
+    buf = malloc(BLOCK_SIZE);
+    if (buf == NULL) {
+        perror("readblock malloc");
+        return NULL;
+    }
+
+    /* Anything other than a complete block is treated as a failure. */
+    if (read(block_fp, buf, BLOCK_SIZE) == BLOCK_SIZE)
+        return buf;
+
+    perror("readblock read");
+    free(buf);
+    return NULL;
+}
+
+/**
+ * writeblock: write an existing block to disk
+ * @id: block id
+ * @block: pointer to block
+ *
+ * @return: zero on success, -1 on failure
+ */
+int writeblock(u64 id, void *block) {
+    ssize_t written;
+
+    /* Block ids are 1-based: id N starts at byte (N-1)*BLOCK_SIZE. */
+    if (lseek64(block_fp, ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) {
+        perror("writeblock lseek");
+        return -1;
+    }
+
+    /*
+     * A short write would leave a partially updated block in the store, so
+     * it must be reported as a failure just like an outright error.  This
+     * mirrors the "!= BLOCK_SIZE" test that allocblock() performs.
+     */
+    written = write(block_fp, block, BLOCK_SIZE);
+    if (written != BLOCK_SIZE) {
+        if (written < 0)
+            perror("writeblock write");
+        else
+            fprintf(stderr, "writeblock write: short write\n");
+        return -1;
+    }
+
+    return 0;
+}
+
+/**
+ * allocblock: write a new block to disk
+ * @block: pointer to block
+ *
+ * @return: new id of block on disk
+ */
+static u64 lastblock = 0;
+
+u64 allocblock(void *block) {
+    off64_t end;
+    u64 newid;
+
+    /* New blocks are always appended at the current end of the store. */
+    end = lseek64(block_fp, 0, SEEK_END);
+    if (end == (off64_t)-1) {
+        perror("allocblock lseek");
+        return 0;
+    }
+
+    /* The store must consist of whole blocks, or ids would not line up. */
+    if ((end % BLOCK_SIZE) != 0) {
+        fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE);
+        return 0;
+    }
+
+    if (write(block_fp, block, BLOCK_SIZE) != BLOCK_SIZE) {
+        perror("allocblock write");
+        return 0;
+    }
+
+    /* Ids are 1-based, which leaves 0 free to signal failure. */
+    newid = end / BLOCK_SIZE + 1;
+
+    /* Sanity check: ids handed out should be strictly increasing. */
+    if (!(newid > lastblock))
+        printf("[*** %Ld alredy allocated! ***]\n", newid);
+
+    lastblock = newid;
+    return newid;
+}
+
+
+/**
+ * newblock: get a new in-memory block set to zeros
+ *
+ * @return: pointer to new block, NULL on error
+ */
+void *newblock() {
+    void *fresh;
+
+    fresh = malloc(BLOCK_SIZE);
+    if (fresh != NULL) {
+        /* Hand back an all-zero block. */
+        memset(fresh, 0, BLOCK_SIZE);
+        return fresh;
+    }
+
+    perror("newblock");
+    return NULL;
+}
+
+
+/**
+ * freeblock: release an in-memory block buffer
+ *   @block: buffer to release; NULL is accepted and ignored
+ */
+void freeblock(void *block) {
+    /* free() is defined to do nothing for NULL, so no explicit test needed. */
+    free(block);
+}
+
+
+int __init_blockstore(void)
+{
+    const int flags = O_RDWR | O_CREAT | O_LARGEFILE;
+
+    /* The relative path resolves against the process's working directory;
+     * the file is created on first use. */
+    block_fp = open("blockstore.dat", flags, 0644);
+    if (block_fp >= 0)
+        return 0;
+
+    perror("open");
+    return -1;
+}
--- /dev/null
+/**************************************************************************
+ *
+ * blockstore.h
+ *
+ * Simple block store interface
+ *
+ */
+
+#ifndef __BLOCKSTORE_H__
+#define __BLOCKSTORE_H__
+
+#ifndef __SHORT_INT_TYPES__
+#define __SHORT_INT_TYPES__
+
+#include <stdint.h>
+
+typedef uint8_t u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint64_t u64;
+typedef int8_t s8;
+typedef int16_t s16;
+typedef int32_t s32;
+typedef int64_t s64;
+
+#endif /* __SHORT_INT_TYPES__ */
+
+/* Size of one blockstore block in bytes; equals 1 << BLOCK_SHIFT. */
+#define BLOCK_SIZE 4096
+/* log2(BLOCK_SIZE): shift that converts a byte offset to a block number. */
+#define BLOCK_SHIFT 12
+/* Mask that clears the low BLOCK_SHIFT bits of a 64-bit byte offset. */
+#define BLOCK_MASK 0xfffffffffffff000LL
+
+/* XXX SMH: where is the below supposed to be defined???? */
+/* Fallback used only when no earlier header has defined SECTOR_SHIFT. */
+#ifndef SECTOR_SHIFT
+#define SECTOR_SHIFT 9
+#endif
+
+
+extern void *newblock();
+extern void *readblock(u64 id);
+extern u64 allocblock(void *block);
+extern int writeblock(u64 id, void *block);
+extern void freeblock(void *block);
+extern int __init_blockstore(void);
+
+#endif /* __BLOCKSTORE_H__ */
--- /dev/null
+/**************************************************************************
+ *
+ * parallax.c
+ *
+ * The Parallax Storage Server
+ *
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "blktaplib.h"
+#include "blockstore.h"
+#include "vdi.h"
+
+#define PARALLAX_DEV 61440
+
+#if 1
+#define DPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* ------[ session records ]----------------------------------------------- */
+
+#define BLKIF_HASHSZ 1024
+#define BLKIF_HASH(_d,_h) (((int)(_d)^(int)(_h))&(BLKIF_HASHSZ-1))
+
+#define VDI_HASHSZ 16
+#define VDI_HASH(_vd) ((((_vd)>>8)^(_vd))&(VDI_HASHSZ-1))
+
+/*
+ * Per-interface session record.  Records are chained through hash_next in
+ * blkif_hash, in the bucket chosen by BLKIF_HASH(domid, handle).
+ */
+typedef struct blkif {
+    /* (domid, handle) together identify the interface. */
+    domid_t domid;
+    unsigned int handle;
+    /* Set to DISCONNECTED on creation; destroy refuses any other state. */
+    enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
+    /* VDIs attached to this interface, bucketed by VDI_HASH(vdevice). */
+    vdi_t *vdi_hash[VDI_HASHSZ];
+    /* Next record in the same blkif_hash bucket. */
+    struct blkif *hash_next;
+} blkif_t;
+
+static blkif_t *blkif_hash[BLKIF_HASHSZ];
+
+blkif_t *blkif_find_by_handle(domid_t domid, unsigned int handle)
+{
+    blkif_t *cur;
+
+    /* Only warn; the lookup below is still carried out. */
+    if ( handle != 0 )
+        printf("blktap/parallax don't currently support non-0 dev handles!\n");
+
+    /* Walk the bucket's chain until a matching record or the end. */
+    for ( cur = blkif_hash[BLKIF_HASH(domid, handle)];
+          cur != NULL;
+          cur = cur->hash_next )
+    {
+        if ( (cur->domid == domid) && (cur->handle == handle) )
+            return cur;
+    }
+
+    return NULL;
+}
+
+vdi_t *blkif_get_vdi(blkif_t *blkif, blkif_vdev_t device)
+{
+    vdi_t *cur;
+
+    /* Scan the per-interface bucket for the VDI bound to this vdevice. */
+    for ( cur = blkif->vdi_hash[VDI_HASH(device)];
+          cur != NULL;
+          cur = cur->next )
+    {
+        if ( cur->vdevice == device )
+            break;
+    }
+
+    /* NULL when no VDI in the bucket matched. */
+    return cur;
+}
+
+/* ------[ control message handling ]-------------------------------------- */
+
+void blkif_create(blkif_be_create_t *create)
+{
+    domid_t domid = create->domid;
+    unsigned int handle = create->blkif_handle;
+    blkif_t *fresh, *cur, **link;
+
+    DPRINTF("parallax (blkif_create): create is %p\n", create);
+
+    fresh = (blkif_t *)malloc(sizeof(blkif_t));
+    if ( fresh == NULL )
+    {
+        DPRINTF("Could not create blkif: out of memory\n");
+        create->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
+        return;
+    }
+
+    /* Start from an all-zero record, then fill in the identifying fields. */
+    memset(fresh, 0, sizeof(*fresh));
+    fresh->domid = domid;
+    fresh->handle = handle;
+    fresh->status = DISCONNECTED;
+
+    /* Look for a duplicate; 'link' ends up at the chain's NULL tail slot. */
+    link = &blkif_hash[BLKIF_HASH(domid, handle)];
+    for ( cur = *link; cur != NULL; cur = *link )
+    {
+        if ( (cur->domid == domid) && (cur->handle == handle) )
+        {
+            DPRINTF("Could not create blkif: already exists\n");
+            create->status = BLKIF_BE_STATUS_INTERFACE_EXISTS;
+            free(fresh);
+            return;
+        }
+        link = &cur->hash_next;
+    }
+
+    /* *link is NULL here, so the new record becomes the last in its chain. */
+    fresh->hash_next = NULL;
+    *link = fresh;
+
+    DPRINTF("Successfully created blkif\n");
+    create->status = BLKIF_BE_STATUS_OKAY;
+}
+
+void blkif_destroy(blkif_be_destroy_t *destroy)
+{
+    domid_t domid = destroy->domid;
+    unsigned int handle = destroy->blkif_handle;
+    blkif_t *victim, **link;
+
+    DPRINTF("parallax (blkif_destroy): destroy is %p\n", destroy);
+
+    /* Find the record, remembering the pointer that refers to it. */
+    link = &blkif_hash[BLKIF_HASH(domid, handle)];
+    for ( victim = *link; victim != NULL; victim = *link )
+    {
+        if ( (victim->domid == domid) && (victim->handle == handle) )
+            break;
+        link = &victim->hash_next;
+    }
+
+    if ( victim == NULL )
+    {
+        destroy->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return;
+    }
+
+    /* Only an interface in the DISCONNECTED state may be torn down. */
+    if ( victim->status != DISCONNECTED )
+    {
+        destroy->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
+        return;
+    }
+
+    /* Unlink from the chain, then release the record. */
+    *link = victim->hash_next;
+    /* destroy_all_vbds(blkif); */
+    free(victim);
+    destroy->status = BLKIF_BE_STATUS_OKAY;
+}
+
+void vbd_grow(blkif_be_vbd_grow_t *grow)
+{
+    blkif_vdev_t vdevice = grow->vdevice;
+    blkif_t *owner;
+    vdi_t *image, **tail;
+
+    DPRINTF("parallax (vbd_grow): grow=%p\n", grow);
+
+    owner = blkif_find_by_handle(grow->domid, grow->blkif_handle);
+    if ( owner == NULL )
+    {
+        DPRINTF("vbd_grow attempted for non-existent blkif (%u,%u)\n",
+                grow->domid, grow->blkif_handle);
+        grow->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND;
+        return;
+    }
+
+    /* The VDI identifier is carried in grow->extent.sector_start. */
+    DPRINTF("vbd_grow: grow->extent.sector_start (id) is %llx\n",
+            grow->extent.sector_start);
+
+    image = vdi_get(grow->extent.sector_start);
+    if ( image == NULL )
+    {
+        printf("parallax (vbd_grow): VDI %llx not found.\n",
+               grow->extent.sector_start);
+        grow->status = BLKIF_BE_STATUS_VBD_NOT_FOUND;
+        return;
+    }
+
+    image->next = NULL;
+    image->vdevice = vdevice;
+
+    /* Append the VDI to the end of its per-interface hash chain. */
+    for ( tail = &owner->vdi_hash[VDI_HASH(vdevice)];
+          *tail != NULL;
+          tail = &(*tail)->next )
+        ;
+    *tail = image;
+
+    DPRINTF("vbd_grow: happy return!\n");
+    grow->status = BLKIF_BE_STATUS_OKAY;
+}
+
+int parallax_control(control_msg_t *msg)
+{
+    int malformed = 0;
+
+    DPRINTF("parallax_control: msg is %p\n", msg);
+
+    if (msg->type != CMSG_BLKIF_BE)
+    {
+        printf("Unexpected control message (%d)\n", msg->type);
+        return 0;
+    }
+
+    /*
+     * Dispatch on the subtype.  Each payload must be exactly the size of
+     * the structure it is cast to; unrecognised subtypes are ignored.
+     */
+    switch(msg->subtype)
+    {
+    case CMSG_BLKIF_BE_CREATE:
+        if ( msg->length == sizeof(blkif_be_create_t) )
+            blkif_create((blkif_be_create_t *)msg->msg);
+        else
+            malformed = 1;
+        break;
+
+    case CMSG_BLKIF_BE_DESTROY:
+        if ( msg->length == sizeof(blkif_be_destroy_t) )
+            blkif_destroy((blkif_be_destroy_t *)msg->msg);
+        else
+            malformed = 1;
+        break;
+
+    case CMSG_BLKIF_BE_VBD_GROW:
+        if ( msg->length == sizeof(blkif_be_vbd_grow_t) )
+            vbd_grow((blkif_be_vbd_grow_t *)msg->msg);
+        else
+            malformed = 1;
+        break;
+    }
+
+    if ( malformed )
+        printf("Bad control message!\n");
+
+    /* Every path returns 0. */
+    return 0;
+}
+
+int parallax_probe(blkif_request_t *req, blkif_t *blkif)
+{
+    /* The response is built in place over the request. */
+    blkif_response_t *rsp = (blkif_response_t *)req;
+    vdisk_t *table;
+    vdi_t *cur;
+    int bucket, count = 0;
+    int acceptable;
+
+    DPRINTF("parallax_probe: req=%p, blkif=%p\n", req, blkif);
+
+    /* Exactly one buffer is expected, and it must span sectors 0..7. */
+    acceptable = (req->nr_segments == 1) &&
+                 (blkif_first_sect(req->frame_and_sects[0]) == 0) &&
+                 (blkif_last_sect (req->frame_and_sects[0]) == 7);
+
+    if ( !acceptable )
+    {
+        rsp->id = req->id;
+        rsp->operation = BLKIF_OP_PROBE;
+        rsp->status = BLKIF_RSP_ERROR;
+
+        DPRINTF("parallax_probe: send error response\n");
+        return BLKTAP_RESPOND;
+    }
+
+    /* Emit one vdisk_t entry per VDI attached to this interface. */
+    for ( bucket = 0; bucket < VDI_HASHSZ; bucket++ )
+    {
+        for ( cur = blkif->vdi_hash[bucket]; cur != NULL; cur = cur->next )
+        {
+            table = (vdisk_t *)MMAP_VADDR(ID_TO_IDX(req->id), 0);
+            table[count].device = cur->vdevice;
+            table[count].info = VDISK_TYPE_DISK | VDISK_FLAG_VIRT;
+            /* The -2 here accounts for the LSB in the radix tree */
+            table[count].capacity =
+                ((1LL << (VDI_HEIGHT-2)) >> SECTOR_SHIFT);
+            count++;
+        }
+    }
+
+    rsp->id = req->id;
+    rsp->operation = BLKIF_OP_PROBE;
+    rsp->status = count; /* number of disks */
+
+    DPRINTF("parallax_probe: send positive response (nr_vdis=%d)\n", count);
+    return BLKTAP_RESPOND;
+}
+
+/*
+ * parallax_read: service a BLKIF_OP_READ request.
+ *   @req:   the request; it is overwritten in place with the response
+ *   @blkif: interface the request arrived on
+ *
+ * For each segment the backing block is looked up in the VDI.  Unmapped
+ * blocks (global id 0) read back as zeros; mapped blocks are fetched from
+ * the blockstore and copied into the guest page.
+ *
+ * @return: always BLKTAP_RESPOND; success or failure is in rsp->status.
+ */
+int parallax_read(blkif_request_t *req, blkif_t *blkif)
+{
+    blkif_response_t *rsp;
+    unsigned long size, offset, start;
+    u64 sector;
+    u64 vblock, gblock;
+    vdi_t *vdi;
+    int i;
+    char *dpage, *spage;
+
+    vdi = blkif_get_vdi(blkif, req->device);
+
+    if ( vdi == NULL )
+        goto err;
+
+    for (i = 0; i < req->nr_segments; i++) {
+
+        dpage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i);
+
+        /* Round the requested segment to a block address. */
+
+        sector = req->sector_number + (8*i);
+        vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT;
+
+        /* Get that block from the store. */
+
+        gblock = vdi_lookup_block(vdi, vblock, NULL);
+
+        /* Calculate read size and offset within the read block. */
+
+        offset = (sector << SECTOR_SHIFT) % BLOCK_SIZE;
+        size = ( blkif_last_sect (req->frame_and_sects[i]) -
+                 blkif_first_sect(req->frame_and_sects[i]) + 1
+               ) << SECTOR_SHIFT;
+        start = blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT;
+
+        DPRINTF("ParallaxRead: sect: %lld (%ld,%ld), "
+                "vblock %llx, gblock %llx, "
+                "size %lx\n",
+                sector, blkif_first_sect(req->frame_and_sects[i]),
+                blkif_last_sect (req->frame_and_sects[i]),
+                vblock, gblock, size);
+
+        /*
+         * The sector range comes from the guest, so refuse anything that
+         * would run past the end of the source block or the destination
+         * page.  "size > BLOCK_SIZE" also catches last_sect < first_sect,
+         * which wraps the unsigned size to a huge value.
+         * NOTE(review): assumes the mapped guest page is BLOCK_SIZE bytes.
+         */
+        if ( (size > BLOCK_SIZE) ||
+             (start > BLOCK_SIZE - size) ||
+             (offset > BLOCK_SIZE - size) ) {
+            printf("parallax_read: segment %d exceeds block bounds\n", i);
+            goto err;
+        }
+
+        /* If the block does not exist in the store, return zeros. */
+        /* Otherwise, copy that region to the guest page.          */
+
+        if ( gblock == 0 ) {
+
+            memset(dpage + start, '\0', size);
+
+        } else {
+
+            spage = readblock(gblock);
+
+            if (spage == NULL) {
+                printf("Error reading gblock from store: %Ld\n", gblock);
+                goto err;
+            }
+
+            memcpy(dpage + start, spage + offset, size);
+
+            freeblock(spage);
+        }
+
+    }
+
+    /* The response must carry the READ opcode, not WRITE. */
+    rsp = (blkif_response_t *)req;
+    rsp->id = req->id;
+    rsp->operation = BLKIF_OP_READ;
+    rsp->status = BLKIF_RSP_OKAY;
+
+    return BLKTAP_RESPOND;
+err:
+    rsp = (blkif_response_t *)req;
+    rsp->id = req->id;
+    rsp->operation = BLKIF_OP_READ;
+    rsp->status = BLKIF_RSP_ERROR;
+
+    return BLKTAP_RESPOND;
+}
+
+/*
+ * parallax_write: service a BLKIF_OP_WRITE request.
+ *   @req:   the request; it is overwritten in place with the response
+ *   @blkif: interface the request arrived on
+ *
+ * Only whole, block-aligned segments are accepted.  Write-in-place is
+ * currently disabled, so every segment is appended to the blockstore as a
+ * new block and the VDI mapping is pointed at it.
+ *
+ * @return: always BLKTAP_RESPOND; success or failure is in rsp->status.
+ */
+int parallax_write(blkif_request_t *req, blkif_t *blkif)
+{
+    blkif_response_t *rsp;
+    u64 sector;
+    int i, writable = 0;
+    u64 vblock, gblock;
+    char *spage;
+    unsigned long size, offset, start;
+    vdi_t *vdi;
+
+    vdi = blkif_get_vdi(blkif, req->device);
+
+    if ( vdi == NULL )
+        goto err;
+
+    for (i = 0; i < req->nr_segments; i++) {
+
+        spage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i);
+
+        /* Round the requested segment to a block address. */
+
+        sector = req->sector_number + (8*i);
+        vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT;
+
+        /* Get that block from the store. */
+
+        gblock = vdi_lookup_block(vdi, vblock, &writable);
+
+        /* Calculate write size and offset within the block. */
+
+        offset = (sector << SECTOR_SHIFT) % BLOCK_SIZE;
+        size = ( blkif_last_sect (req->frame_and_sects[i]) -
+                 blkif_first_sect(req->frame_and_sects[i]) + 1
+               ) << SECTOR_SHIFT;
+        start = blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT;
+
+        DPRINTF("ParallaxWrite: sect: %lld (%ld,%ld), "
+                "vblock %llx, gblock %llx, "
+                "size %lx\n",
+                sector, blkif_first_sect(req->frame_and_sects[i]),
+                blkif_last_sect (req->frame_and_sects[i]),
+                vblock, gblock, size);
+
+        /* XXX: For now we just freak out if they try to write a   */
+        /* non block-sized, block-aligned page.                    */
+
+        if ((offset != 0) || (size != BLOCK_SIZE) || (start != 0)) {
+            printf("]\n] STRANGE WRITE!\n]\n");
+            goto err;
+        }
+
+        /*
+         * Write-in-place is disabled till radix is sorted out, so always
+         * append a fresh block regardless of 'writable'.
+         */
+        gblock = allocblock(spage);
+
+        /*
+         * allocblock() returns 0 on failure, and block id 0 means "empty
+         * block" to the radix tree.  Installing it would silently replace
+         * the guest's data with zeros, so fail the request instead.
+         */
+        if ( gblock == 0 ) {
+            printf("Error allocating block in store for vblock: %Ld\n",
+                   vblock);
+            goto err;
+        }
+
+        vdi_update_block(vdi, vblock, gblock);
+    }
+
+    rsp = (blkif_response_t *)req;
+    rsp->id = req->id;
+    rsp->operation = BLKIF_OP_WRITE;
+    rsp->status = BLKIF_RSP_OKAY;
+
+    return BLKTAP_RESPOND;
+err:
+    rsp = (blkif_response_t *)req;
+    rsp->id = req->id;
+    rsp->operation = BLKIF_OP_WRITE;
+    rsp->status = BLKIF_RSP_ERROR;
+
+    return BLKTAP_RESPOND;
+}
+
+int parallax_request(blkif_request_t *req)
+{
+    blkif_response_t *rsp;
+    domid_t dom = ID_TO_DOM(req->id);
+    blkif_t *blkif = blkif_find_by_handle(dom, 0);
+
+    /* Hand recognised operations to their handlers. */
+    if (blkif != NULL)
+    {
+        switch ( req->operation )
+        {
+        case BLKIF_OP_PROBE:
+            return parallax_probe(req, blkif);
+
+        case BLKIF_OP_READ:
+            return parallax_read(req, blkif);
+
+        case BLKIF_OP_WRITE:
+            return parallax_write(req, blkif);
+
+        default:
+            /* Unknown operation */
+            break;
+        }
+    }
+
+    /* No such interface, or an unknown operation: reply with an error. */
+    rsp = (blkif_response_t *)req;
+    rsp->id = req->id;
+    rsp->operation = req->operation;
+    rsp->status = BLKIF_RSP_ERROR;
+    return BLKTAP_RESPOND;
+}
+
+void __init_parallax(void)
+{
+    int bucket;
+
+    /* Start with every interface hash bucket empty. */
+    for ( bucket = 0; bucket < BLKIF_HASHSZ; bucket++ )
+        blkif_hash[bucket] = NULL;
+}
+
+
+/*
+ * Daemon entry point: initialise the blockstore, the VDI registry and the
+ * local interface table, register the control and request hooks with
+ * blktap, then hand control to blktap_listen().
+ */
+int main(int argc, char *argv[])
+{
+    DPRINTF("parallax: starting.\n");
+
+    /*
+     * Without an open blockstore every later read or write would fail, so
+     * give up immediately rather than start serving requests.
+     */
+    if ( __init_blockstore() < 0 )
+    {
+        fprintf(stderr, "parallax: could not open the blockstore, exiting.\n");
+        return 1;
+    }
+    DPRINTF("parallax: initialized blockstore...\n");
+    __init_vdi();
+    DPRINTF("parallax: initialized vdi registry etc...\n");
+    __init_parallax();
+    DPRINTF("parallax: initialized local stuff..\n");
+
+    blktap_register_ctrl_hook("parallax_control", parallax_control);
+    blktap_register_request_hook("parallax_request", parallax_request);
+    DPRINTF("parallax: added ctrl + request hooks, starting listen...\n");
+    blktap_listen();
+
+    return 0;
+}
--- /dev/null
+/*
+ * Radix tree for mapping (up to) 63-bit virtual block IDs to
+ * 63-bit global block IDs
+ *
+ * Pointers within the tree set aside the least significant bit to indicate
+ * whether or not the target block is writable from this node.
+ *
+ * The block with ID 0 is assumed to be an empty block of all zeros
+ */
+
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include "blockstore.h"
+#include "radix.h"
+
+/* Each tree node holds 512 (1 << 9) 64-bit entries, i.e. 9 key bits/level. */
+#define RADIX_TREE_MAP_SHIFT 9
+#define RADIX_TREE_MAP_MASK 0x1ff
+#define RADIX_TREE_MAP_ENTRIES 512
+
+/*
+#define DEBUG
+*/
+
+#define ZERO 0LL
+#define ONE 1LL
+/*
+ * Mask that clears only the LSB (the "writable" flag) of a tree entry.
+ * It must have all 16 hex digits: the previous 15-digit constant also
+ * cleared the top four bits, corrupting any entry that used them.
+ */
+#define ONEMASK 0xfffffffffffffffeULL
+
+
+typedef u64 *radix_tree_node;
+
+/*
+ * block device interface and other helper functions
+ * with these functions, block id is just a 63-bit number, with
+ * no special consideration for the LSB
+ */
+radix_tree_node cloneblock(radix_tree_node block);
+
+/*
+ * main api
+ * with these functions, the LSB of root always indicates
+ * whether or not the block is writable, including the return
+ * values of update and snapshot
+ */
+u64 lookup(int height, u64 root, u64 key);
+u64 update(int height, u64 root, u64 key, u64 val);
+u64 snapshot(u64 root);
+
+/**
+ * cloneblock: clone an existing block in memory
+ * @block: the old block
+ *
+ * @return: new block, with LSB cleared for every entry
+ */
+radix_tree_node cloneblock(radix_tree_node block) {
+    radix_tree_node copy;
+    int slot;
+
+    copy = (radix_tree_node) malloc(BLOCK_SIZE);
+    if (copy == NULL) {
+        perror("cloneblock malloc");
+        return NULL;
+    }
+
+    /* Copy each entry through ONEMASK, dropping its writable bit. */
+    slot = 0;
+    while (slot < RADIX_TREE_MAP_ENTRIES) {
+        copy[slot] = block[slot] & ONEMASK;
+        slot++;
+    }
+
+    return copy;
+}
+
+/**
+ * lookup: find a value given a key
+ * @height: height in bits of the radix tree
+ * @root: root node id, with set LSB indicating writable node
+ * @key: key to lookup
+ *
+ * @return: value on success, zero on error
+ */
+u64 lookup(int height, u64 root, u64 key) {
+    radix_tree_node level;
+    int shift;
+
+    /* The key must fit within 'height' bits. */
+    assert(key >> height == 0);
+
+    /*
+     * The root block may be smaller to ensure all leaves are full, so
+     * begin at the largest multiple of the per-level shift below 'height'
+     * and peel off one equal-sized chunk of key bits per iteration.
+     */
+    for (shift = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
+         ;
+         shift -= RADIX_TREE_MAP_SHIFT) {
+
+#ifdef DEBUG
+        printf("lookup: height=%3d root=%3Ld offset=%3d%s\n", shift, root,
+                (int) ((key >> shift) & RADIX_TREE_MAP_MASK),
+                (iswritable(root) ? "" : " (readonly)"));
+#endif
+
+        /* An id of zero means nothing is mapped below this point. */
+        if (getid(root) == ZERO)
+            break;
+
+        level = (radix_tree_node) readblock(getid(root));
+        if (level == NULL)
+            break;
+
+        root = level[(key >> shift) & RADIX_TREE_MAP_MASK];
+        freeblock(level);
+
+        /* At shift 0 the entry just read is the value being looked up. */
+        if (shift == 0)
+            return root;
+    }
+
+    return ZERO;
+}
+
+/*
+ * update: set a radix tree entry, doing copy-on-write as necessary
+ * @height: height in bits of the radix tree
+ * @root: root node id, with set LSB indicating writable node
+ * @key: key to set
+ * @val: value to set, s.t. radix(key)=val
+ *
+ * @returns: (possibly new) root id on success (with LSB=1), 0 on failure
+ */
+u64 update(int height, u64 root, u64 key, u64 val) {
+    int offset;
+    u64 child;
+    radix_tree_node node;
+
+    /* base case--return val */
+    if (height == 0)
+        return val;
+
+    /* the root block may be smaller to ensure all leaves are full */
+    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
+    offset = (key >> height) & RADIX_TREE_MAP_MASK;
+
+#ifdef DEBUG
+    printf("update: height=%3d root=%3Ld offset=%3d%s\n", height, root,
+            offset, (iswritable(root)?"":" (clone)"));
+#endif
+
+    /* load a block, or create a new one */
+    if (root == ZERO) {
+        node = (radix_tree_node) newblock();
+    } else {
+        node = (radix_tree_node) readblock(getid(root));
+
+        /*
+         * Clone only if the read succeeded: cloneblock() dereferences its
+         * argument, so passing on a NULL from readblock() would crash.
+         * A failed read falls through to the NULL check below.
+         */
+        if (node != NULL && !iswritable(root)) {
+            /* need to clone this node */
+            radix_tree_node oldnode = node;
+            node = cloneblock(node);
+            freeblock(oldnode);
+            /* root == ZERO marks the node as needing a fresh block id */
+            root = ZERO;
+        }
+    }
+
+    if (node == NULL) {
+#ifdef DEBUG
+        printf("update: node is null!\n");
+#endif
+        return ZERO;
+    }
+
+    /* recurse into the child slot selected by this level's key bits */
+    child = update(height, node[offset], key, val);
+
+    if (child == ZERO) {
+        freeblock(node);
+        return ZERO;
+    } else if (child == node[offset]) {
+        /* no change, so we already owned the child */
+        assert(iswritable(root));
+
+        freeblock(node);
+        return root;
+    }
+
+    node[offset] = child;
+
+    /* new/cloned blocks need to be saved */
+    if (root == ZERO) {
+        /* mark this as an owned block */
+        root = allocblock(node);
+        if (root)
+            root = writable(root);
+    } else if (writeblock(getid(root), node) < 0) {
+        freeblock(node);
+        return ZERO;
+    }
+
+    freeblock(node);
+    return root;
+}
+
+/**
+ * snapshot: create a snapshot
+ * @root: old root node
+ *
+ * @return: new root node, 0 on error
+ */
+u64 snapshot(u64 root) {
+    radix_tree_node original, copy;
+    u64 newid;
+
+    original = readblock(getid(root));
+    if (original == NULL)
+        return ZERO;
+
+    /* The copy has the writable bit cleared on every entry. */
+    copy = cloneblock(original);
+    freeblock(original);
+    if (copy == NULL)
+        return ZERO;
+
+    /* Store the copy as a brand-new block; it becomes the new root. */
+    newid = allocblock(copy);
+    freeblock(copy);
+
+    return (newid == ZERO) ? ZERO : writable(newid);
+}
+
+/*
+ * print_root: dump the radix tree under @root in Graphviz "dot" syntax.
+ *   @root:   root entry of the (sub)tree to print
+ *   @height: height in bits of the (sub)tree
+ *   @val:    key prefix accumulated so far (passed down, not printed)
+ *   @dot_f:  open output stream, or NULL for the outermost call
+ *
+ * With @dot_f == NULL the function creates "radix.dot", writes the graph
+ * preamble and postamble around the tree, and closes the file.  Recursive
+ * calls pass the already-open stream and only emit nodes and edges.
+ */
+void print_root(u64 root, int height, u64 val, FILE *dot_f)
+{
+    FILE *f;
+    int i;
+    int own_file = (dot_f == NULL);
+    radix_tree_node node = NULL;
+    char *style[2] = { "", "style=bold,color=blue," };
+
+    if (own_file) {
+        f = fopen("radix.dot", "w");
+        if (f == NULL) {
+            perror("print_root: open");
+            return;
+        }
+
+        /* write graph preamble */
+        fprintf(f, "digraph G {\n");
+
+        /* add a node for this root. */
+        fprintf(f, " n%Ld [%sshape=box,label=\"%Ld\"];\n",
+                getid(root), style[iswritable(root)], getid(root));
+    } else {
+        /* Recursive call: write to the caller's stream.  Previously 'f' */
+        /* was left uninitialised on this path.                          */
+        f = dot_f;
+    }
+
+    /* base case--print this node's children without descending further */
+    if (height == 0) {
+        node = (radix_tree_node) readblock(getid(root));
+        if (node == NULL)
+            goto out;
+
+        /* add a node and edge for each child root */
+        for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++) {
+            if (node[i] != 0) {
+                fprintf(f, " n%Ld [%sshape=box,label=\"%Ld\"];\n",
+                        getid(node[i]), style[iswritable(node[i])],
+                        getid(node[i]));
+                fprintf(f, " n%Ld -> n%Ld [label=\"%d\"]\n", getid(root),
+                        getid(node[i]), i);
+            }
+        }
+        goto out;
+    }
+
+    /* the root block may be smaller to ensure all leaves are full */
+    height = ((height - 1) / RADIX_TREE_MAP_SHIFT) * RADIX_TREE_MAP_SHIFT;
+
+    if (getid(root) == ZERO)
+        goto out;
+
+    node = (radix_tree_node) readblock(getid(root));
+    if (node == NULL)
+        goto out;
+
+    /* add a node and edge for each child root, descending into each */
+    for (i = 0; i < RADIX_TREE_MAP_ENTRIES; i++) {
+        if (node[i] != 0) {
+            fprintf(f, " n%Ld [%sshape=box,label=\"%Ld\"];\n",
+                    getid(node[i]), style[iswritable(node[i])],
+                    getid(node[i]));
+            print_root(node[i], height-RADIX_TREE_MAP_SHIFT,
+                       val + (((u64)i)<<height), f);
+            fprintf(f, " n%Ld -> n%Ld [label=\"%d\"]\n", getid(root),
+                    getid(node[i]), i);
+        }
+    }
+
+out:
+    /* Release the node buffer (freeblock() accepts NULL). */
+    freeblock(node);
+
+    /* write graph postamble; only the outermost call owns the file, and */
+    /* it now closes it on every exit path rather than just the last.    */
+    if (own_file) {
+        fprintf(f, "}\n");
+        fclose(f);
+    }
+}
+
+#ifdef RADIX_STANDALONE
+
+/*
+ * Interactive test driver for the radix tree.
+ *
+ * Before every prompt the current tree is dumped to radix.dot and rendered
+ * to radix.ps with "dot".  Commands are read from stdin until "quit" or
+ * end of input.
+ */
+int main(int argc, char **argv) {
+    u64 key = ZERO, val = ZERO;
+    u64 root = writable(ONE);
+    char buff[4096];
+
+    __init_blockstore();
+
+    memset(buff, 0, sizeof(buff));
+
+    printf("Recognized commands:\n"
+           "Note: the LSB of a node number indicates if it is writable\n"
+           " root <node> set root to <node>\n"
+           " snapshot take a snapshot of the root\n"
+           " set <key> <val> set key=val\n"
+           " get <key> query key\n"
+           " quit\n"
+           "\nroot = %Ld\n", root);
+    for (;;) {
+        print_root(root, 34, 0, NULL);
+        system("dot radix.dot -Tps -o radix.ps");
+
+        printf("> ");
+        fflush(stdout);
+        /* stop on EOF *or* read error; feof() alone misses the latter */
+        if (fgets(buff, 1024, stdin) == NULL)
+            break;
+        if (sscanf(buff, " root %Ld", &root) == 1) {
+            printf("root set to %Ld\n", root);
+        } else if (sscanf(buff, " set %Ld %Ld", &key, &val) == 2) {
+            root = update(34, root, key, val);
+            printf("root = %Ld\n", root);
+        } else if (sscanf(buff, " get %Ld", &key) == 1) {
+            /* lookup() takes (height, root, key), as declared in radix.h */
+            val = lookup(34, root, key);
+            printf("value = %Ld\n", val);
+        } else if (!strcmp(buff, "quit\n")) {
+            break;
+        } else if (!strcmp(buff, "snapshot\n")) {
+            root = snapshot(root);
+            printf("new root = %Ld\n", root);
+        } else if (sscanf(buff, " pr %Ld", &root) == 1) {
+            /* NOTE(review): "pr" also replaces the current root */
+            print_root(root, 34, 0, NULL);
+        } else {
+            printf("command not recognized\n");
+        }
+    }
+    return 0;
+}
+
+#endif
--- /dev/null
+/*
+ * Radix tree for mapping (up to) 63-bit virtual block IDs to
+ * 63-bit global block IDs
+ *
+ * Pointers within the tree set aside the least significant bit to indicate
+ * whether or not the target block is writable from this node.
+ *
+ * The block with ID 0 is assumed to be an empty block of all zeros
+ */
+
+#ifndef __RADIX_H__
+#define __RADIX_H__
+
+/*
+ * Tagged-pointer helpers (I don't really like exposing these, but...).
+ * A tree pointer is a block ID shifted left by one bit, with the writable
+ * flag stored in bit 0:
+ *   getid(x)      - recover the block ID: drop bit 0, keep the low 63 bits
+ *   putid(x)      - turn block ID x into a pointer with the flag clear
+ *   writable(x)   - turn block ID x into a pointer with the flag set
+ *   iswritable(x) - non-zero iff the writable flag of pointer x is set
+ */
+#define getid(x) (((x)>>1)&0x7fffffffffffffffLL)
+#define putid(x) ((x)<<1)
+#define writable(x) (((x)<<1)|1LL)
+#define iswritable(x) ((x)&1LL)
+
+/*
+ * main api
+ * with these functions, the LSB of root always indicates
+ * whether or not the block is writable, including the return
+ * values of update and snapshot
+ */
+/* look up key in the tree of the given height rooted at root */
+u64 lookup(int height, u64 root, u64 key);
+/* set key to val; returns the (possibly new) root, 0 on error */
+u64 update(int height, u64 root, u64 key, u64 val);
+/* clone the root node; returns the new writable root, 0 on error */
+u64 snapshot(u64 root);
+/* NOTE(review): definition not visible here; presumably reports whether
+ * the path to key is writable (not shared with a snapshot) -- confirm */
+int isprivate(int height, u64 root, u64 key);
+
+#endif /* __RADIX_H__ */
--- /dev/null
+/**************************************************************************
+ *
+ * snaplog.c
+ *
+ * Snapshot log on-disk data structure.
+ *
+ */
+
+ /* VDI histories are made from chains of snapshot logs. These logs record
+ * the (radix) root and timestamp of individual snapshots.
+ *
+ * creation of a new VDI involves 'forking' a snapshot log, by creating a
+ * new, empty log (in a new VDI) and parenting it off of a record in an
+ * existing snapshot log.
+ *
+ * snapshot log blocks have at most one writer.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "snaplog.h"
+
+
+
+/*
+ * snap_get_block: read a snapshot log block from the block store.
+ *
+ * Returns the block, which the caller must release with freeblock(), or
+ * NULL if the block could not be read or does not carry SNAP_MAGIC.
+ */
+snap_block_t *snap_get_block(u64 block)
+{
+    snap_block_t *candidate = (snap_block_t *)readblock(block);
+
+    if ( candidate != NULL && candidate->hdr.magic != SNAP_MAGIC ) {
+        /* readable, but not a snapshot log block */
+        freeblock(candidate);
+        candidate = NULL;
+    }
+
+    return candidate;
+}
+
+/*
+ * snap_get_id: fetch the snapshot record named by a snapshot id.
+ *
+ * @id:     block/index pair naming the record (may be NULL)
+ * @target: where to copy the record
+ *
+ * Returns 0 on success, -1 if either pointer is NULL, the block is not a
+ * readable snapshot log block, or the index is out of range.
+ */
+int snap_get_id(snap_id_t *id, snap_rec_t *target)
+{
+    snap_block_t *blk;
+
+    if ( (id == NULL) || (target == NULL) )
+        return -1;
+
+    blk = snap_get_block(id->block);
+
+    if ( blk == NULL )
+        return -1;
+
+    /* Never index past the end of snaps[], whatever the header claims.
+     * NOTE(review): index == nr_entries (the not-yet-written next slot)
+     * is still accepted, as before -- confirm whether callers rely on it. */
+    if ( (id->index >= SNAPS_PER_BLOCK)
+         || (id->index > blk->hdr.nr_entries) ) {
+        freeblock(blk);
+        return -1;
+    }
+
+    *target = blk->snaps[id->index];
+    freeblock(blk);
+    return 0;
+}
+
+/*
+ * __snap_block_create: allocate a new, empty snapshot log block.
+ *
+ * @parent_id: record the new block is parented on, or NULL for a root log
+ * @fork_id:   record where this log was forked; must be non-NULL whenever
+ *             @parent_id is non-NULL
+ * @new_id:    receives the id (block, index 0) of the new block
+ *
+ * When the parent lives in a different block than the fork point (i.e. the
+ * new block continues an existing log), the running log_entries count is
+ * carried over from the parent block.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int __snap_block_create(snap_id_t *parent_id, snap_id_t *fork_id,
+                        snap_id_t *new_id)
+{
+    snap_block_t *blk, *pblk;
+
+    /* a parent without a fork point cannot be recorded in the header */
+    if ( (parent_id != NULL) && (fork_id == NULL) )
+        return -1;
+
+    blk = (snap_block_t *)newblock();
+    if ( blk == NULL )
+        return -1;
+
+    blk->hdr.magic = SNAP_MAGIC;
+    blk->hdr.nr_entries = 0;
+    blk->hdr.log_entries = 0;
+    blk->hdr.immutable = 0;
+
+    if ( (parent_id != NULL)
+         && (parent_id->block != fork_id->block)
+         && (parent_id->block != 0)) {
+
+        pblk = snap_get_block(parent_id->block);
+        if ( pblk == NULL ) {
+            freeblock(blk);
+            return -1;
+        }
+        blk->hdr.log_entries = pblk->hdr.log_entries;
+        freeblock(pblk);
+    }
+
+    if (parent_id != NULL) {
+        blk->hdr.parent_block = *parent_id;
+        blk->hdr.fork_block = *fork_id;
+    } else {
+        blk->hdr.parent_block = null_snap_id;
+        blk->hdr.fork_block = null_snap_id;
+    }
+
+    new_id->index = 0;
+    new_id->block = allocblock(blk);
+
+    /* allocblock() has stored the contents; the buffer is ours to free */
+    freeblock(blk);
+
+    if (new_id->block == 0)
+        return -1;
+
+    return 0;
+}
+
+/*
+ * snap_block_create: start a new snapshot log.
+ *
+ * Creates an empty log block whose parent and fork point are both
+ * parent_id (or the null id when parent_id is NULL) and stores the new
+ * block's id in new_id.  Returns whatever __snap_block_create() returns:
+ * 0 on success, -1 on failure.
+ */
+int snap_block_create(snap_id_t *parent_id, snap_id_t *new_id)
+{
+ return __snap_block_create(parent_id, parent_id, new_id);
+}
+
+/*
+ * snap_append: append a snapshot record to a snapshot log.
+ *
+ * @old_id: id of the next free slot in the log (block + index)
+ * @rec:    record to append
+ * @new_id: receives the id of the next free slot after the append; may
+ *          alias @old_id
+ *
+ * If the current block is full, a new block parented on its last record
+ * is created, the full block is marked immutable, and the record goes into
+ * the new block.
+ *
+ * Returns 0 on success.  Returns -1 on failure, in which case *new_id is
+ * left untouched.
+ */
+int snap_append(snap_id_t *old_id, snap_rec_t *rec, snap_id_t *new_id)
+{
+    snap_id_t id, next;
+    snap_block_t *blk;
+
+    if ( (old_id == NULL) || (rec == NULL) || (new_id == NULL) )
+        return -1;
+
+    id = *old_id;
+    blk = snap_get_block(id.block);
+    if ( blk == NULL )
+        return -1;
+
+    if ( blk->hdr.immutable != 0 ) {
+        printf("Attempt to snap an immutable snap block!\n");
+        freeblock(blk);
+        return -1;
+    }
+
+    /* build the result locally so a failure cannot corrupt the caller's id */
+    next.block = id.block;
+
+    if (blk->hdr.nr_entries >= SNAPS_PER_BLOCK) {
+        id.index--; /* make id point to the last full record */
+
+        if ( __snap_block_create(&id, &blk->hdr.fork_block, &next) != 0 ) {
+            freeblock(blk);
+            return -1;
+        }
+
+        /* seal the full block before switching to the new one */
+        blk->hdr.immutable = 1;
+        if ( writeblock(id.block, blk) < 0 ) {
+            freeblock(blk);
+            return -1;
+        }
+        freeblock(blk);
+
+        blk = snap_get_block(next.block);
+        if ( blk == NULL )
+            return -1;
+        id = next;
+    }
+
+    blk->snaps[blk->hdr.nr_entries] = *rec;
+    blk->hdr.nr_entries++;
+    blk->hdr.log_entries++;
+    next.index = blk->hdr.nr_entries;
+
+    if ( writeblock(id.block, blk) < 0 ) {
+        freeblock(blk);
+        return -1;
+    }
+    freeblock(blk);
+
+    *new_id = next;
+    return 0;
+}
+
+/*
+ * snap_print_history: print a snapshot log, newest record first.
+ *
+ * Starting at snap_id, prints the records of that block from snap_id's
+ * index down to 0, then follows hdr.parent_block and repeats from the
+ * parent's index, until a parent block id of 0 (or an unreadable block)
+ * ends the chain.
+ */
+void snap_print_history(snap_id_t *snap_id)
+{
+    snap_id_t id;
+    unsigned int idx;
+    snap_block_t *blk, *parent;
+
+    if ( snap_id == NULL )
+        return;
+
+    id = *snap_id;
+    blk = snap_get_block(id.block);
+
+    while ( blk != NULL ) {
+        /* restart from this block's own index; the countdown below leaves
+         * idx wrapped around, so it must not carry over between blocks */
+        idx = id.index;
+        if ( idx >= SNAPS_PER_BLOCK )
+            idx = SNAPS_PER_BLOCK - 1;   /* stay inside snaps[] */
+
+        printf("[Snap block %Ld]:\n", id.block);
+        do {
+            printf(" %03u: root: %Ld ts: %ld.%ld\n", idx,
+                   blk->snaps[idx].radix_root,
+                   blk->snaps[idx].timestamp.tv_sec,
+                   blk->snaps[idx].timestamp.tv_usec);
+        } while (idx-- != 0);
+
+        id = blk->hdr.parent_block;
+        /* a parent block id of 0 terminates the chain */
+        parent = (id.block != 0) ? snap_get_block(id.block) : NULL;
+        freeblock(blk);
+        blk = parent;
+    }
+}
--- /dev/null
+/**************************************************************************
+ *
+ * snaplog.h
+ *
+ * Snapshot log on-disk data structure.
+ *
+ */
+
+#ifndef __SNAPLOG_H__
+#define __SNAPLOG_H__
+
+#include <sys/time.h> /* for struct timeval */
+#include "blockstore.h" /* for BLOCK_SIZE */
+
+/* Names one record slot in a snapshot log: a log block and an index
+ * into that block's snaps[] array. */
+typedef struct snap_id {
+    u64 block;
+    unsigned int index;
+} snap_id_t;
+
+/* One snapshot: the radix root it froze and when it was taken. */
+typedef struct snap_rec {
+    u64 radix_root;
+    struct timeval timestamp;
+} snap_rec_t;
+
+
+int snap_block_create(snap_id_t *parent_id, snap_id_t *new_id);
+int snap_append(snap_id_t *id, snap_rec_t *rec, snap_id_t *new_id);
+void snap_print_history(snap_id_t *snap_id);
+int snap_get_id(snap_id_t *id, snap_rec_t *target);
+
+
+/* exported for vdi debugging */
+#define SNAP_MAGIC 0xff00ff0aa0ff00ffLL
+
+/* the "no snapshot" id: block 0, index 0 */
+static const snap_id_t null_snap_id = { 0, 0 };
+
+typedef struct snap_block_hdr {
+    u64 magic;                  /* SNAP_MAGIC for a valid log block */
+    snap_id_t parent_block;     /* parent block within this chain */
+    snap_id_t fork_block;       /* where this log was forked */
+    unsigned log_entries;       /* total entries since forking */
+    unsigned short nr_entries;  /* entries in snaps[] */
+    unsigned short immutable;   /* has this snap page become immutable? */
+} snap_block_hdr_t;
+
+
+/* number of records that fit in one block after the header */
+#define SNAPS_PER_BLOCK \
+    ((BLOCK_SIZE - sizeof(snap_block_hdr_t)) / sizeof(snap_rec_t))
+
+typedef struct snap_block {
+    snap_block_hdr_t hdr;
+    snap_rec_t snaps[SNAPS_PER_BLOCK];
+} snap_block_t;
+
+
+snap_block_t *snap_get_block(u64 block);
+
+#endif /* __SNAPLOG_H__ */
--- /dev/null
+/**************************************************************************
+ *
+ * vdi.c
+ *
+ * Virtual Disk Image (VDI) Interfaces
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+#define VDI_REG_BLOCK 1LL
+#define VDI_RADIX_ROOT writable(2)
+
+#if 1
+#define DPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DPRINTF(_f, _a...) ((void)0)
+#endif
+
+/* I haven't decided about this registry stuff, so this is just a really
+ * quick lash-up so that there is some way to track VDIs.
+ *
+ * (Most vdi access should be with a direct handle to the block, so this
+ * registry is just for start-of-day lookup and other control operations.)
+ */
+
+/*
+ * create_vdi_registry: initialise the on-disk VDI registry.
+ *
+ * Zero-fills the block holding the registry's radix root, then writes a
+ * fresh registry header (magic set, no VDIs) to VDI_REG_BLOCK.
+ *
+ * Returns the in-memory registry block, which the caller must release
+ * with freeblock(), or NULL if allocation or either write fails.
+ */
+vdi_registry_t *create_vdi_registry(void)
+{
+    vdi_registry_t *reg = (vdi_registry_t *)newblock();
+
+    if (reg == NULL)
+        return NULL;
+
+    /* zero-fill the vdi radix root while we have an empty block.
+     * VDI_RADIX_ROOT is a tagged pointer; writeblock() wants the raw id. */
+    if ( writeblock(getid(VDI_RADIX_ROOT), (void *)reg) < 0 ) {
+        freeblock(reg);
+        return NULL;
+    }
+
+    DPRINTF("[vdi.c] Creating VDI registry!\n");
+    reg->magic = VDI_REG_MAGIC;
+    reg->nr_vdis = 0;
+
+    if ( writeblock(VDI_REG_BLOCK, (void *)reg) < 0 ) {
+        freeblock(reg);
+        return NULL;
+    }
+
+    return reg;
+}
+
+/*
+ * get_vdi_registry: read the VDI registry, creating it if the registry
+ * block cannot be read.
+ *
+ * Returns the registry block, which the caller must release with
+ * freeblock(), or NULL if it could not be read/created or its magic
+ * number is wrong.
+ */
+vdi_registry_t *get_vdi_registry(void)
+{
+    vdi_registry_t *vdi_reg = (vdi_registry_t *)readblock(VDI_REG_BLOCK);
+
+    if ( vdi_reg == NULL )
+        vdi_reg = create_vdi_registry();
+
+    /* creation can fail too */
+    if ( vdi_reg == NULL )
+        return NULL;
+
+    if ( vdi_reg->magic != VDI_REG_MAGIC ) {
+        freeblock(vdi_reg);
+        return NULL;
+    }
+
+    return vdi_reg;
+}
+
+/*
+ * vdi_create: create a new VDI and add it to the registry.
+ *
+ * @parent_snap: snapshot to fork from, or NULL for an empty VDI.  If the
+ *               snapshot record can be fetched, the new VDI starts from a
+ *               snapshot() of its radix root; otherwise it starts from a
+ *               newly allocated empty root.
+ * @name:        human readable name; truncated to VDI_NAME_SZ-1 characters
+ *
+ * Returns the new VDI block, which the caller must release with
+ * freeblock(), or NULL on failure.
+ */
+vdi_t *vdi_create(snap_id_t *parent_snap, char *name)
+{
+    int ret;
+    vdi_t *vdi;
+    vdi_registry_t *vdi_reg;
+    snap_rec_t snap_rec;
+
+    if ( name == NULL )
+        return NULL;
+
+    /* create a vdi struct */
+    vdi = newblock();
+    if (vdi == NULL)
+        return NULL;
+
+    if ( snap_get_id(parent_snap, &snap_rec) == 0 ) {
+        vdi->radix_root = snapshot(snap_rec.radix_root);
+    } else {
+        vdi->radix_root = allocblock((void *)vdi); /* vdi is just zeros here */
+        if ( vdi->radix_root != 0 )
+            vdi->radix_root = writable(vdi->radix_root); /* grr. */
+    }
+
+    /* both snapshot() and allocblock() report failure as 0 */
+    if ( vdi->radix_root == 0 ) {
+        DPRINTF("Error creating radix root in vdi_create.\n");
+        freeblock(vdi);
+        return NULL;
+    }
+
+    /* create a snapshot log, and add it to the vdi struct */
+
+    ret = snap_block_create(parent_snap, &vdi->snap);
+    if ( ret != 0 ) {
+        DPRINTF("Error getting snap block in vdi_create.\n");
+        freeblock(vdi);
+        return NULL;
+    }
+
+    /* append the vdi to the registry, fill block and id. */
+    /* implicit allocation means we have to write the vdi twice here. */
+    vdi_reg = get_vdi_registry();
+    if ( vdi_reg == NULL ) {
+        freeblock(vdi);
+        return NULL;
+    }
+
+    vdi->block = allocblock((void *)vdi);
+    if ( vdi->block == 0 ) {
+        DPRINTF("Error allocating vdi block in vdi_create.\n");
+        freeblock(vdi_reg);
+        freeblock(vdi);
+        return NULL;
+    }
+
+    vdi->id = vdi_reg->nr_vdis++;
+    /* name[] has VDI_NAME_SZ bytes: the terminator goes in the last one */
+    strncpy(vdi->name, name, VDI_NAME_SZ - 1);
+    vdi->name[VDI_NAME_SZ - 1] = '\0';
+    writeblock(vdi->block, (void *)vdi);
+
+    /* only publish the new VDI count once the id -> block mapping exists */
+    if ( update(VDI_REG_HEIGHT, VDI_RADIX_ROOT, vdi->id, vdi->block) == 0 ) {
+        DPRINTF("Error registering vdi in vdi_create.\n");
+        freeblock(vdi_reg);
+        freeblock(vdi);
+        return NULL;
+    }
+    writeblock(VDI_REG_BLOCK, (void *)vdi_reg);
+    freeblock(vdi_reg);
+
+    return vdi;
+}
+
+/*
+ * vdi_get: load a VDI by its registry id.
+ *
+ * Looks the id up in the registry radix tree and reads the block it maps
+ * to.  Returns NULL if the id maps to 0; otherwise returns whatever
+ * readblock() returns for the mapped block.
+ */
+vdi_t *vdi_get(u64 vdi_id)
+{
+    u64 where = lookup(VDI_REG_HEIGHT, VDI_RADIX_ROOT, vdi_id);
+
+    return ( where != 0 ) ? (vdi_t *)readblock(where) : NULL;
+}
+
+/*
+ * vdi_lookup_block: translate a VDI-relative block number into a global
+ * block id.
+ *
+ * @vdi:       VDI whose radix tree is consulted
+ * @vdi_block: block number within the VDI
+ * @writable:  if non-NULL, receives the writable bit of the mapping
+ *
+ * Returns the global block id with the writable bit stripped.  Also
+ * prints a one-line trace of the lookup to stdout.
+ */
+u64 vdi_lookup_block(vdi_t *vdi, u64 vdi_block, int *writable)
+{
+    u64 entry = lookup(VDI_HEIGHT, vdi->radix_root, vdi_block);
+    u64 blk_id = getid(entry);
+
+    if (writable != NULL)
+        *writable = iswritable(entry);
+
+    printf("lu: root: %11Ld, gblock: %11Ld, id: %11Ld, wr: %Ld\n",
+           vdi->radix_root, entry, blk_id, iswritable(entry));
+
+    return blk_id;
+}
+
+/*
+ * vdi_update_block: map a VDI-relative block number to a global block.
+ *
+ * @vdi:       VDI to modify (its radix_root is updated and persisted)
+ * @vdi_block: block number within the VDI
+ * @g_block:   global block id to map it to
+ *
+ * If the radix update fails, the VDI is left unchanged, both in memory
+ * and on disk.
+ */
+void vdi_update_block(vdi_t *vdi, u64 vdi_block, u64 g_block)
+{
+    u64 id, new_root;
+
+    /* updates are always writable. */
+    id = writable(g_block);
+
+    new_root = update(VDI_HEIGHT, vdi->radix_root, vdi_block, id);
+    if ( new_root == 0 ) {
+        /* update() returns 0 on error; keep the old root rather than
+         * persisting an empty mapping */
+        printf("update returned failure\n");
+        return;
+    }
+
+    vdi->radix_root = new_root;
+    writeblock(vdi->block, vdi);
+}
+
+/*
+ * vdi_snapshot: take a snapshot of a VDI.
+ *
+ * Records the current radix root and the current time in the VDI's
+ * snapshot log, switches the VDI to a new root produced by snapshot(),
+ * and writes the VDI block back.  If taking the snapshot or appending to
+ * the log fails, the VDI keeps its current root and is not rewritten.
+ */
+void vdi_snapshot(vdi_t *vdi)
+{
+    snap_rec_t rec;
+    u64 new_root;
+    int ret;
+
+    rec.radix_root = vdi->radix_root;
+    gettimeofday(&rec.timestamp, NULL);
+
+    /* snapshot() returns 0 on error; never install that as the root */
+    new_root = snapshot(vdi->radix_root);
+    if ( new_root == 0 ) {
+        printf("snapshot returned failure\n");
+        return;
+    }
+
+    ret = snap_append(&vdi->snap, &rec, &vdi->snap);
+    if ( ret != 0 ) {
+        printf("snap_append returned failure\n");
+        return;
+    }
+
+    vdi->radix_root = new_root;
+    writeblock(vdi->block, vdi);
+}
+
+/*
+ * __init_vdi: make sure a VDI registry exists.
+ *
+ * Fetching the registry creates it when it is missing.  Returns 0 if a
+ * registry was obtained, -1 (after printing a message) otherwise.
+ */
+int __init_vdi()
+{
+    vdi_registry_t *registry;
+
+    registry = get_vdi_registry();
+    if (registry != NULL) {
+        /* only its existence matters here */
+        freeblock(registry);
+        return 0;
+    }
+
+    printf("[vdi.c] Couldn't get/create a VDI registry!\n");
+    return -1;
+}
+
+#ifdef VDI_STANDALONE
+
+#define TEST_VDIS 50
+#define NR_ITERS 50000
+#define FORK_POINTS 200
+#define INIT_VDIS 3
+#define INIT_SNAPS 40
+
+/* These must be of decreasing size: */
+#define NEW_FORK (RAND_MAX-(RAND_MAX/1000))
+#define NEW_ROOT_VDI (RAND_MAX-((RAND_MAX/1000)*2))
+#define NEW_FORK_VDI (RAND_MAX-((RAND_MAX/1000)*3))
+
+#define GRAPH_DOT_FILE "vdi.dot"
+#define GRAPH_PS_FILE "vdi.ps"
+
+
+typedef struct sh_st {
+ snap_id_t id;
+ struct sh_st *next;
+} sh_t;
+
+#define SNAP_HASHSZ 1024
+sh_t *node_hash[SNAP_HASHSZ];
+#define SNAP_HASH(_id) (((int)(_id)->block^(_id)->index)%SNAP_HASHSZ)
+
+#define SNAPID_EQUAL(_a,_b) \
+ (((_a)->block==(_b)->block) && ((_a)->index==(_b)->index))
+/*
+ * sh_check_and_add: remember a snapshot id in node_hash.
+ *
+ * Returns 1 if the id was already present.  Otherwise appends it to its
+ * hash chain and returns 0 (also returned, without adding, if malloc
+ * fails).
+ */
+int sh_check_and_add(snap_id_t *id)
+{
+    sh_t **s = &node_hash[SNAP_HASH(id)];
+
+    while (*s != NULL) {
+        if (SNAPID_EQUAL(&((*s)->id), id))
+            return 1;
+        /* advance the cursor; assigning through it would unlink entries */
+        s = &(*s)->next;
+    }
+
+    *s = (sh_t *)malloc(sizeof(sh_t));
+    if (*s == NULL)
+        return 0;
+    (*s)->id = *id;
+    (*s)->next = NULL;
+
+    return 0;
+}
+
+/*
+ * Standalone stress test: creates a few seed VDIs, runs a random workload
+ * of snapshots, new fork points, new root VDIs and forked VDIs, then dumps
+ * the resulting fork graph to GRAPH_DOT_FILE and renders it with "dot".
+ */
+int main(int argc, char *argv[])
+{
+    vdi_t *vdi_list[TEST_VDIS];
+    vdi_t *new_vdi;
+    snap_id_t id, fork_points[FORK_POINTS];
+    int nr_vdis = 0, nr_forks = 0;
+    int i, j, r;
+    FILE *f;
+    char name[VDI_NAME_SZ];
+
+    __init_blockstore();
+    __init_vdi();
+
+    printf("[o] Generating seed VDIs. (%d VDIs)\n", INIT_VDIS);
+
+    for (i=0; i<INIT_VDIS; i++) {
+        r=rand();
+
+        sprintf(name, "VDI Number %d", nr_vdis);
+        vdi_list[i] = vdi_create(NULL, name);
+        if ( vdi_list[i] == NULL ) {
+            printf("Failed to create VDI!\n");
+            return 1;
+        }
+        for (j=0; j<(r%INIT_SNAPS); j++)
+            vdi_snapshot(vdi_list[i]);
+        fork_points[i] = vdi_list[i]->snap;
+        nr_vdis++;
+        nr_forks++;
+    }
+
+    printf("[o] Running a random workload. (%d iterations)\n", NR_ITERS);
+
+    for (i=0; i<NR_ITERS; i++) {
+        r = rand();
+
+        if ( r > NEW_FORK ) {
+            /* fork_points[] holds FORK_POINTS entries: stop when full */
+            if ( nr_forks >= FORK_POINTS )
+                continue;
+            id = vdi_list[r%nr_vdis]->snap;
+            if ( ( id.block == 0 ) || ( id.index == 0 ) )
+                continue;
+            id.index--;
+            fork_points[nr_forks++] = id;
+
+        } else if ( r > NEW_ROOT_VDI ) {
+
+            if ( nr_vdis == TEST_VDIS )
+                continue;
+
+            sprintf(name, "VDI Number %d.", nr_vdis);
+            new_vdi = vdi_create(NULL, name);
+            if ( new_vdi != NULL )
+                vdi_list[nr_vdis++] = new_vdi;
+
+        } else if ( r > NEW_FORK_VDI ) {
+
+            if ( nr_vdis == TEST_VDIS )
+                continue;
+
+            sprintf(name, "VDI Number %d.", nr_vdis);
+            new_vdi = vdi_create(&fork_points[r%nr_forks], name);
+            if ( new_vdi != NULL )
+                vdi_list[nr_vdis++] = new_vdi;
+
+        } else /* SNAPSHOT */ {
+
+            vdi_snapshot(vdi_list[r%nr_vdis]);
+
+        }
+    }
+
+    /* now dump it out to a dot file. */
+    printf("[o] Dumping state to a dot graph. (%d VDIs)\n", nr_vdis);
+
+    f = fopen(GRAPH_DOT_FILE, "w");
+    if ( f == NULL ) {
+        perror(GRAPH_DOT_FILE);
+        return 1;
+    }
+
+    /* write graph preamble */
+    fprintf(f, "digraph G {\n");
+    fprintf(f, " rankdir=LR\n");
+
+    for (i=0; i<nr_vdis; i++) {
+        char oldnode[255];
+        snap_block_t *blk;
+        snap_id_t cur = vdi_list[i]->snap;
+        int nr_snaps, done=0;
+
+        /* add a node for the id */
+        printf("vdi: %d\n", i);
+        fprintf(f, " n%Ld%d [color=blue,shape=box,label=\"%s\\nb:%Ld\\nidx:%d\"]\n",
+                cur.block, cur.index, vdi_list[i]->name,
+                cur.block, cur.index);
+        sprintf(oldnode, "n%Ld%d", cur.block, cur.index);
+
+        /* walk fork points back towards the root of the VDI tree */
+        while (cur.block != 0) {
+            blk = snap_get_block(cur.block);
+            if ( blk == NULL )
+                break;
+            nr_snaps = blk->hdr.log_entries - (blk->hdr.nr_entries - cur.index);
+            cur = blk->hdr.fork_block;
+
+            done = sh_check_and_add(&cur);
+
+            /* add a node for the fork_id */
+            if (!done) {
+                fprintf(f, " n%Ld%d [shape=box,label=\"b:%Ld\\nidx:%d\"]\n",
+                        cur.block, cur.index,
+                        cur.block, cur.index);
+            }
+
+            /* add an edge between them */
+            fprintf(f, " n%Ld%d -> %s [label=\"%u snapshots\"]\n",
+                    cur.block, cur.index, oldnode, nr_snaps);
+            sprintf(oldnode, "n%Ld%d", cur.block, cur.index);
+            freeblock(blk);
+
+            /* an already-seen fork point has had its ancestry drawn */
+            if (done) break;
+        }
+    }
+
+    /* write graph postamble */
+    fprintf(f, "}\n");
+    fclose(f);
+
+    printf("[o] Generating postscript graph. (%s)\n", GRAPH_PS_FILE);
+    {
+        char cmd[255];
+        sprintf(cmd, "dot %s -Tps -o %s", GRAPH_DOT_FILE, GRAPH_PS_FILE);
+        system(cmd);
+    }
+    return 0;
+}
+
+#endif
--- /dev/null
+/**************************************************************************
+ *
+ * vdi.h
+ *
+ * Virtual Disk Image (VDI) Interfaces
+ *
+ */
+
+#ifndef __VDI_H__
+#define __VDI_H__
+
+#include "blktaplib.h"
+#include "snaplog.h"
+
+#define VDI_HEIGHT 35
+#define VDI_REG_HEIGHT 35 /* why not? */
+
+#define VDI_NAME_SZ 256
+
+/*
+ * A virtual disk image.  vdi_create() allocates one of these in a block
+ * of its own and vdi_get() reads it back from that block, so the layout
+ * is also the stored format.
+ */
+typedef struct vdi {
+ u64 id; /* unique vdi id -- used by the registry */
+ u64 block; /* block where this vdi lives (also unique)*/
+ u64 radix_root; /* radix root node for block mappings */
+ snap_id_t snap; /* next snapshot slot for this VDI */
+ struct vdi *next; /* used to hash-chain in blkif. */
+ blkif_vdev_t vdevice; /* currently mounted as... */
+ char name[VDI_NAME_SZ];/* human readable vdi name */
+} vdi_t;
+
+#define VDI_REG_MAGIC 0xff00ff0bb0ff00ffLL
+
+/*
+ * Header of the VDI registry block.  The id -> block mapping itself is
+ * kept in a separate radix tree.
+ */
+typedef struct vdi_registry {
+ u64 magic; /* VDI_REG_MAGIC for a valid registry */
+ u64 nr_vdis; /* number of VDIs created; also the next id handed out */
+} vdi_registry_t;
+
+
+int __init_vdi(void);
+
+vdi_t *vdi_get(u64 vdi_id);
+vdi_registry_t *get_vdi_registry(void);
+vdi_t *vdi_create(snap_id_t *parent_snap, char *name);
+u64 vdi_lookup_block(vdi_t *vdi, u64 vdi_block, int *writable);
+void vdi_update_block(vdi_t *vdi, u64 vdi_block, u64 g_block);
+void vdi_snapshot(vdi_t *vdi);
+
+
+#endif /* __VDI_H__ */
--- /dev/null
+/**************************************************************************
+ *
+ * vdi_create.c
+ *
+ * Create a new vdi.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+/*
+ * vdi_create <VDI Name> [<snap block> <snap idx>]
+ *
+ * Creates a new VDI, optionally forked from the given snapshot id.
+ * Exits with status 0 on success and -1 on usage error or failure.
+ */
+int main(int argc, char *argv[])
+{
+    vdi_t *vdi;
+    char name[VDI_NAME_SZ] = "";
+    snap_id_t id;
+    int from_snap = 0;
+
+    __init_blockstore();
+    __init_vdi();
+
+    if ( argc == 1 ) {
+        printf("usage: %s <VDI Name> [<snap block> <snap idx>]\n", argv[0]);
+        exit(-1);
+    }
+
+    /* name[] has VDI_NAME_SZ bytes: keep the last one for the terminator */
+    strncpy( name, argv[1], VDI_NAME_SZ - 1);
+    name[VDI_NAME_SZ - 1] = '\0';
+
+    if ( argc > 3 ) {
+        id.block = (u64) atoll(argv[2]);
+        id.index = (unsigned int) atol (argv[3]);
+        from_snap = 1;
+    }
+
+    vdi = vdi_create( from_snap ? &id : NULL, name);
+
+    if ( vdi == NULL ) {
+        printf("Failed to create VDI!\n");
+        exit(-1);
+    }
+
+    freeblock(vdi);
+
+    return (0);
+}
--- /dev/null
+/**************************************************************************
+ *
+ * vdi_fill.c
+ *
+ * Hoover a file or device into a vdi.
+ * You must first create the vdi with vdi_create.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+/*
+ * vdi_fill <VDI id> <filename>
+ *
+ * Copies a file or device into an existing VDI one block at a time:
+ * each block read is stored with allocblock() and mapped at the next
+ * VDI block number.  A short final read is zero-padded to a full block.
+ */
+int main(int argc, char *argv[])
+{
+    vdi_t *vdi;
+    u64 id;
+    int fd;
+    struct stat st;
+    u64 tot_size;
+    char spage[BLOCK_SIZE];
+    u64 vblock = 0;
+    ssize_t count;   /* signed: read() returns -1 on error */
+
+    __init_blockstore();
+    __init_vdi();
+
+    if ( argc < 3 ) {
+        printf("usage: %s <VDI id> <filename>\n", argv[0]);
+        exit(-1);
+    }
+
+    id = (u64) atoll(argv[1]);
+
+    vdi = vdi_get( id );
+
+    if ( vdi == NULL ) {
+        printf("Failed to retreive VDI %Ld!\n", id);
+        exit(-1);
+    }
+
+    fd = open(argv[2], O_RDONLY | O_LARGEFILE);
+
+    if (fd < 0) {
+        printf("Couldn't open %s!\n", argv[2]);
+        exit(-1);
+    }
+
+    if ( fstat(fd, &st) != 0 ) {
+        printf("Couldn't stat %s!\n", argv[2]);
+        exit(-1);
+    }
+
+    tot_size = (u64) st.st_size;
+    printf("Filling VDI %Ld with %Ld bytes.\n", id, tot_size);
+
+    printf("%011Ld blocks total\n", tot_size / BLOCK_SIZE);
+    printf(" ");
+    while ( ( count = read(fd, spage, BLOCK_SIZE) ) > 0 ) {
+        u64 gblock;
+
+        /* don't store leftovers of the previous block after a short read */
+        if ( (size_t)count < BLOCK_SIZE )
+            memset(spage + count, 0, BLOCK_SIZE - (size_t)count);
+
+        gblock = allocblock(spage);
+        if ( gblock == 0 ) {
+            printf("\nCouldn't allocate a block for VDI block %Ld!\n", vblock);
+            exit(-1);
+        }
+        vdi_update_block(vdi, vblock, gblock);
+
+        vblock++;
+        if ((vblock % 512) == 0)
+            printf("\b\b\b\b\b\b\b\b\b\b\b%011Ld", vblock);
+        fflush(stdout);
+    }
+    printf("\n");
+
+    if ( count < 0 ) {
+        perror("read");
+        exit(-1);
+    }
+
+    close(fd);
+    freeblock(vdi);
+
+    return (0);
+}
--- /dev/null
+/**************************************************************************
+ *
+ * vdi_list.c
+ *
+ * Print a list of VDIs on the block store.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+/*
+ * vdi_list: print the id and name of every VDI that can be loaded from
+ * the registry, one per line.
+ */
+int main(int argc, char *argv[])
+{
+    vdi_registry_t *registry;
+    vdi_t *entry;
+    int n = 0;
+
+    __init_blockstore();
+    __init_vdi();
+
+    registry = get_vdi_registry();
+    if ( registry == NULL ) {
+        printf("couldn't get VDI registry.\n");
+        exit(-1);
+    }
+
+    /* ids are handed out sequentially, so probe 0 .. nr_vdis-1 */
+    while ( n < registry->nr_vdis ) {
+        entry = vdi_get(n++);
+        if ( entry == NULL )
+            continue;
+
+        printf("%10Ld %60s\n", entry->id, entry->name);
+        freeblock(entry);
+    }
+
+    freeblock(registry);
+
+    return 0;
+}
--- /dev/null
+/**************************************************************************
+ *
+ * vdi_snap.c
+ *
+ * Snapshot a vdi.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+/*
+ * vdi_snap <VDI id>
+ *
+ * Takes a snapshot of the given VDI.
+ */
+int main(int argc, char *argv[])
+{
+    vdi_t *vdi;
+    u64 id;
+
+    __init_blockstore();
+    __init_vdi();
+
+    if ( argc == 1 ) {
+        printf("usage: %s <VDI id>\n", argv[0]);
+        exit(-1);
+    }
+
+    id = (u64) atoll(argv[1]);
+
+    vdi = vdi_get(id);
+
+    if ( vdi == NULL ) {
+        /* nothing to release: the lookup failed */
+        printf("couldn't find the requested VDI.\n");
+        exit(-1);
+    }
+
+    vdi_snapshot(vdi);
+
+    freeblock(vdi);
+
+    return 0;
+}
--- /dev/null
+/**************************************************************************
+ *
+ * vdi_snap_list.c
+ *
+ * Print a list of snapshots for the specified vdi.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+/*
+ * vdi_snap_list <VDI id> [max snaps]
+ *
+ * Prints the snapshots of a VDI, newest first, following parent links
+ * from one snapshot log block to the next.  At most [max snaps] records
+ * are printed when given; otherwise the whole history is printed.
+ */
+int main(int argc, char *argv[])
+{
+    vdi_t *vdi;
+    u64 id;
+    int i, start, max_snaps = -1;
+    snap_block_t *blk;
+    snap_id_t sid;
+    char *t;
+
+    __init_blockstore();
+    __init_vdi();
+
+    if ( argc == 1 ) {
+        printf("usage: %s <VDI id> [max snaps]\n", argv[0]);
+        exit(-1);
+    }
+
+    id = (u64) atoll(argv[1]);
+
+    if ( argc > 2 ) {
+        max_snaps = atoi(argv[2]);
+    }
+
+    vdi = vdi_get(id);
+
+    if ( vdi == NULL ) {
+        printf("couldn't find the requested VDI.\n");
+        exit(-1);
+    }
+
+    sid = vdi->snap;
+    /* vdi->snap names the next free slot, so the newest record is one
+     * below it; index 0 means the current block holds no records yet. */
+    start = (sid.index == 0) ? -1 : (int)(sid.index - 1);
+
+    printf("%6s%4s%37s %12s\n", "Block", "idx", "timestamp", "radix root");
+
+    while (sid.block != 0) {
+        blk = snap_get_block(sid.block);
+        if ( blk == NULL ) {
+            printf("couldn't read snap block %Ld.\n", sid.block);
+            break;
+        }
+
+        /* never index past the end of snaps[] */
+        if ( start >= (int)SNAPS_PER_BLOCK )
+            start = (int)SNAPS_PER_BLOCK - 1;
+
+        for (i = start; i >= 0; i--) {
+            if ( max_snaps == 0 ) {
+                freeblock(blk);
+                goto done;
+            }
+            t = ctime(&blk->snaps[i].timestamp.tv_sec);
+            if ( t == NULL )
+                t = "(invalid time)";
+            else
+                t[strlen(t)-1] = '\0';   /* strip ctime's trailing newline */
+            printf("%6Ld%4u%30s %06lu %12Ld\n",
+                   sid.block, (unsigned int)i,
+                   t,
+                   (unsigned long)blk->snaps[i].timestamp.tv_usec,
+                   blk->snaps[i].radix_root);
+            if ( max_snaps != -1 )
+                max_snaps--;
+        }
+
+        /* continue with the parent log, starting at the parent record */
+        sid = blk->hdr.parent_block;
+        start = (int)sid.index;
+        freeblock(blk);
+    }
+done:
+    freeblock(vdi);
+    return 0;
+}
--- /dev/null
+/**************************************************************************
+ *
+ * vdi_tree.c
+ *
+ * Output current vdi tree to dot and postscript.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+#define GRAPH_DOT_FILE "vdi.dot"
+#define GRAPH_PS_FILE "vdi.ps"
+
+typedef struct sh_st {
+ snap_id_t id;
+ struct sh_st *next;
+} sh_t;
+
+#define SNAP_HASHSZ 1024
+sh_t *node_hash[SNAP_HASHSZ];
+#define SNAP_HASH(_id) (((int)(_id)->block^(_id)->index)%SNAP_HASHSZ)
+
+#define SNAPID_EQUAL(_a,_b) \
+ (((_a)->block==(_b)->block) && ((_a)->index==(_b)->index))
+/*
+ * sh_check_and_add: remember a snapshot id in node_hash.
+ *
+ * Returns 1 if the id was already present.  Otherwise appends it to its
+ * hash chain and returns 0 (also returned, without adding, if malloc
+ * fails).
+ */
+int sh_check_and_add(snap_id_t *id)
+{
+    sh_t **s = &node_hash[SNAP_HASH(id)];
+
+    while (*s != NULL) {
+        if (SNAPID_EQUAL(&((*s)->id), id))
+            return 1;
+        /* advance the cursor; assigning through it would unlink entries */
+        s = &(*s)->next;
+    }
+
+    *s = (sh_t *)malloc(sizeof(sh_t));
+    if (*s == NULL)
+        return 0;
+    (*s)->id = *id;
+    (*s)->next = NULL;
+
+    return 0;
+}
+
+/*
+ * vdi_tree [<postscript file>]
+ *
+ * Writes the fork graph of all registered VDIs to GRAPH_DOT_FILE and
+ * renders it with "dot" into the given postscript file (GRAPH_PS_FILE by
+ * default).
+ */
+int main(int argc, char *argv[])
+{
+    FILE *f;
+    char dot_file[255] = GRAPH_DOT_FILE;
+    char ps_file[255] = GRAPH_PS_FILE;
+    vdi_registry_t *reg;
+    vdi_t *vdi;
+    int i;
+
+    __init_blockstore();
+    __init_vdi();
+
+    reg = get_vdi_registry();
+
+    if ( reg == NULL ) {
+        printf("couldn't get VDI registry.\n");
+        exit(-1);
+    }
+
+    if ( argc > 1 ) {
+        /* keep the last byte of ps_file[] for the terminator */
+        strncpy(ps_file, argv[1], sizeof(ps_file) - 1);
+        ps_file[sizeof(ps_file) - 1] = '\0';
+    }
+
+    /* now dump it out to a dot file. */
+    printf("[o] Dumping state to a dot graph. (%Ld VDIs)\n", reg->nr_vdis);
+
+    f = fopen(dot_file, "w");
+    if ( f == NULL ) {
+        perror(dot_file);
+        freeblock(reg);
+        exit(-1);
+    }
+
+    /* write graph preamble */
+    fprintf(f, "digraph G {\n");
+    fprintf(f, " rankdir=LR\n");
+
+    for (i=0; i<reg->nr_vdis; i++) {
+        char oldnode[255];
+        snap_block_t *blk;
+        snap_id_t id;
+        int nr_snaps, done=0;
+
+        vdi = vdi_get(i);
+        if ( vdi == NULL )
+            continue;   /* id not present in the registry tree */
+
+        id = vdi->snap;
+        /* add a node for the id */
+        printf("vdi: %d\n", i);
+        fprintf(f, " n%Ld%d [color=blue,shape=box,label=\"%s\\nb:%Ld\\nidx:%d\"]\n",
+                id.block, id.index, vdi->name,
+                id.block, id.index);
+        sprintf(oldnode, "n%Ld%d", id.block, id.index);
+        freeblock(vdi);
+
+        /* walk fork points back towards the root of the VDI tree */
+        while (id.block != 0) {
+            blk = snap_get_block(id.block);
+            if ( blk == NULL )
+                break;
+            nr_snaps = blk->hdr.log_entries - (blk->hdr.nr_entries - id.index);
+            id = blk->hdr.fork_block;
+
+            done = sh_check_and_add(&id);
+
+            /* add a node for the fork_id */
+            if (!done) {
+                fprintf(f, " n%Ld%d [shape=box,label=\"b:%Ld\\nidx:%d\"]\n",
+                        id.block, id.index,
+                        id.block, id.index);
+            }
+
+            /* add an edge between them */
+            fprintf(f, " n%Ld%d -> %s [label=\"%u snapshots\"]\n",
+                    id.block, id.index, oldnode, nr_snaps);
+            sprintf(oldnode, "n%Ld%d", id.block, id.index);
+            freeblock(blk);
+
+            /* an already-seen fork point has had its ancestry drawn */
+            if (done) break;
+        }
+    }
+
+    /* write graph postamble */
+    fprintf(f, "}\n");
+    fclose(f);
+    freeblock(reg);
+
+    printf("[o] Generating postscript graph. (%s)\n", ps_file);
+    {
+        /* large enough for both file names plus the fixed command text */
+        char cmd[sizeof(dot_file) + sizeof(ps_file) + 32];
+        snprintf(cmd, sizeof(cmd), "dot %s -Tps -o %s", dot_file, ps_file);
+        system(cmd);
+    }
+    return 0;
+}
--- /dev/null
+/**************************************************************************
+ *
+ * vdi_validate.c
+ *
+ * Intended to sanity-check vdi_fill and the underlying vdi code.
+ *
+ * Block-by-block compare of a vdi with a file/device on the disk.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "blockstore.h"
+#include "radix.h"
+#include "vdi.h"
+
+/*
+ * vdi_validate <VDI id> <filename>
+ *
+ * Compares a VDI block-by-block against a file or device.  Exits with
+ * status 0 only if every block read from the file is mapped in the VDI
+ * and matches; any error or mismatch exits with status -1.
+ */
+int main(int argc, char *argv[])
+{
+    vdi_t *vdi;
+    u64 id;
+    int fd;
+    struct stat st;
+    u64 tot_size;
+    char spage[BLOCK_SIZE], *dpage;
+    u64 vblock = 0;
+    ssize_t count;   /* signed: read() returns -1 on error */
+
+    __init_blockstore();
+    __init_vdi();
+
+    if ( argc < 3 ) {
+        printf("usage: %s <VDI id> <filename>\n", argv[0]);
+        exit(-1);
+    }
+
+    id = (u64) atoll(argv[1]);
+
+    vdi = vdi_get( id );
+
+    if ( vdi == NULL ) {
+        printf("Failed to retreive VDI %Ld!\n", id);
+        exit(-1);
+    }
+
+    fd = open(argv[2], O_RDONLY | O_LARGEFILE);
+
+    if (fd < 0) {
+        printf("Couldn't open %s!\n", argv[2]);
+        exit(-1);
+    }
+
+    if ( fstat(fd, &st) != 0 ) {
+        printf("Couldn't stat %s!\n", argv[2]);
+        exit(-1);
+    }
+
+    tot_size = (u64) st.st_size;
+    printf("Testing VDI %Ld (%Ld bytes).\n", id, tot_size);
+
+    printf(" ");
+    while ( ( count = read(fd, spage, BLOCK_SIZE) ) > 0 ) {
+        u64 gblock;
+
+        gblock = vdi_lookup_block(vdi, vblock, NULL);
+
+        if (gblock == 0) {
+            printf("\n\nfound an unmapped VDI block (%Ld)\n", vblock);
+            exit(-1);
+        }
+
+        dpage = readblock(gblock);
+        if (dpage == NULL) {
+            printf("\n\ncouldn't read block %Ld (VDI block %Ld)\n",
+                   gblock, vblock);
+            exit(-1);
+        }
+
+        /* after a short read only the first count bytes come from the file */
+        if (memcmp(spage, dpage, (size_t)count) != 0) {
+            printf("\n\nblocks don't match! (%Ld)\n", vblock);
+            exit(-1);
+        }
+
+        freeblock(dpage);
+
+        vblock++;
+        printf("\b\b\b\b\b\b\b\b\b\b\b%011Ld", vblock);
+        fflush(stdout);
+    }
+    printf("\n");
+
+    if ( count < 0 ) {
+        perror("read");
+        exit(-1);
+    }
+
+    printf("VDI %Ld looks good!\n", id);
+
+    close(fd);
+    freeblock(vdi);
+
+    return (0);
+}
# Add a new disk type that will just pass an opaque id in the
# start_sector and use an experimental device type.
# Please contact andrew.warfield@cl.cam.ac.uk with any concerns.
- if self.type == 'amorfs':
+ if self.type == 'parallax':
self.node = node
self.device = 61440 # (240,0)
self.start_sector = long(self.params)